Data Link: https://www.dropbox.com/sh/7fo4efxhpenexqp/AACmuri_l-LDiVDUDJ3hVLqPa?dl=0
This tutorial is based on one from ClickSecurity's Data_hacking repo: https://github.com/carriegardner428/data_hacking/blob/master/contagio_traffic_analysis/contagio_traffic_analysis.ipynb
This dataset represents samples of malicious network traffic from the Contagio malware dump. Samples are classified into "APT", "CRIME", and "METASPLOIT" threat categories, and are further divided into per-sample groupings.
In [30]:
    
import os
import pandas as pd
from datetime import datetime
    
In [58]:
    
# Mapping of fields of the files we want to read in and initial setup of pandas dataframes
# Column layouts for each Bro/Zeek log type we ingest. The final two names,
# 'threat' and 'sample', are labels we attach ourselves from the directory
# path rather than fields present in the raw log files.
logs_to_process = {
                    'conn.log' : ['ts','uid','id.orig_h','id.orig_p','id.resp_h','id.resp_p','proto','service','duration','orig_bytes','resp_bytes','conn_state','local_orig','missed_bytes','history','orig_pkts','orig_ip_bytes','resp_pkts','resp_ip_bytes','tunnel_parents','threat','sample'],
                    'dns.log' : ['ts','uid','id.orig_h','id.orig_p','id.resp_h','id.resp_p','proto','trans_id','query','qclass','qclass_name','qtype','qtype_name','rcode','rcode_name','AA','TC','RD','RA','Z','answers','TTLs','rejected','threat','sample'],
                    'files.log' : ['ts','fuid','tx_hosts','rx_hosts','conn_uids','source','depth','analyzers','mime_type','filename','duration','local_orig','is_orig','seen_bytes','total_bytes','missing_bytes','overflow_bytes','timedout','parent_fuid','md5','sha1','sha256','extracted','threat','sample'],
                    'ftp.log' : ['ts','uid','id.orig_h','id.orig_p','id.resp_h','id.resp_p','user','password','command','arg','mime_type','file_size','reply_code','reply_msg','data_channel.passive','data_channel.orig_h','data_channel.resp_h','data_channel.resp_p','fuid','threat','sample'],
                    'http.log' : ['ts','uid','id.orig_h','id.orig_p','id.resp_h','id.resp_p','trans_depth','method','host','uri','referrer','user_agent','request_body_len','response_body_len','status_code','status_msg','info_code','info_msg','filename','tags','username','password','proxied','orig_fuids','orig_mime_types','resp_fuids','resp_mime_types','threat','sample'],
                    'notice.log' : ['ts','uid','id.orig_h','id.orig_p','id.resp_h','id.resp_p','fuid','file_mime_type','file_desc','proto','note','msg','sub','src','dst','p','n','peer_descr','actions','suppress_for','dropped','remote_location.country_code','remote_location.region','remote_location.city','remote_location.latitude','remote_location.longitude','threat','sample'],
                    'signatures.log' : ['ts','src_addr','src_port','dst_addr','dst_port','note','sig_id','event_msg','sub_msg','sig_count','host_count','threat','sample'],
                    'smtp.log' : ['ts','uid','id.orig_h','id.orig_p','id.resp_h','id.resp_p','trans_depth','helo','mailfrom','rcptto','date','from','to','reply_to','msg_id','in_reply_to','subject','x_originating_ip','first_received','second_received','last_reply','path','user_agent','fuids','is_webmail','threat','sample'],
                    'ssl.log' : ['ts','uid','id.orig_h','id.orig_p','id.resp_h','id.resp_p','version','cipher','server_name','session_id','subject','issuer_subject','not_valid_before','not_valid_after','last_alert','client_subject','client_issuer_subject','cert_hash','validation_status','threat','sample'],
                    'tunnel.log' : ['ts','uid','id.orig_h','id.orig_p','id.resp_h','id.resp_p','tunnel_type','action','threat','sample'],
                    'weird.log' : ['ts','uid','id.orig_h','id.orig_p','id.resp_h','id.resp_p','name','addl','notice','peer','threat','sample']
                  }

def _empty_log_frame(logname):
    # Zero-row frame carrying the column layout for one log type.
    return pd.DataFrame(columns=logs_to_process[logname])

conndf   = _empty_log_frame('conn.log')
dnsdf    = _empty_log_frame('dns.log')
filesdf  = _empty_log_frame('files.log')
ftpdf    = _empty_log_frame('ftp.log')
httpdf   = _empty_log_frame('http.log')
noticedf = _empty_log_frame('notice.log')
sigdf    = _empty_log_frame('signatures.log')
smtpdf   = _empty_log_frame('smtp.log')
ssldf    = _empty_log_frame('ssl.log')
tunneldf = _empty_log_frame('tunnel.log')
weirddf  = _empty_log_frame('weird.log')
    
In [59]:
    
def clean_timestamp(df):
    """Turn the raw Bro 'ts' epoch-seconds column into a DatetimeIndex.

    Drops the '#close ...' footer row Bro appends to each log, parses the
    remaining epoch strings into datetimes, and returns a copy of the frame
    indexed by that timestamp with the 'ts' column removed.
    """
    # Reference: https://github.com/carriegardner428/data_hacking/blob/master/contagio_traffic_analysis/contagio_traffic_analysis.ipynb
    # DROP RECORDS THAT DON'T HAVE A TIMESTAMP ('#close' footer line).
    # .copy() so the mutations below don't trigger SettingWithCopyWarning
    # on a view of the caller's frame.
    df = df[df.ts.str.contains("#close") == False].copy()
    df_time = [datetime.fromtimestamp(float(date)) for date in df['ts'].values]
    # Assign the plain list (positional), NOT pd.Series(df_time): after the
    # boolean filter the frame keeps its original, gappy integer index, and a
    # fresh Series would align on 0..n-1 instead, leaving NaT timestamps.
    df['timestamp'] = df_time
    df.drop('ts', axis=1, inplace=True)
    df.set_index('timestamp', inplace=True)
    return df
    
In [60]:
    
# Walk the extracted PCAP log tree and load every recognized Bro log file.
# Expected layout (path depth 5): ./data/PCAPS_TRAFFIC_PATTERNS/<x>/<threat>/<sample>/<log>
# DataFrame.append is deprecated (removed in pandas 2.0) and is quadratic when
# called inside a loop, so accumulate per-log-type frames in lists and run a
# single pd.concat per log type at the end.
frames = {log_name: [] for log_name in logs_to_process}
for dirName, subdirList, fileList in os.walk('./data/PCAPS_TRAFFIC_PATTERNS/'):
    for fname in fileList:
        tags = dirName.split('/')
        if len(tags) == 5 and fname in logs_to_process:
            try:
                # skiprows=8 skips Bro's '#'-prefixed header lines; the last
                # two column names ('threat', 'sample') are our own labels,
                # not fields in the file, hence the [:-2] slice.
                tempdf = pd.read_csv(dirName + '/' + fname, sep='\t', skiprows=8,
                                     header=None, names=logs_to_process[fname][:-2])
                tempdf['threat'] = tags[3]
                tempdf['sample'] = tags[4]
                tempdf = clean_timestamp(tempdf)
                if tags[2] == "0":
                    print('%s/%s' % (dirName, fname))
                frames[fname].append(tempdf)
            except Exception as e:
                print("Error: {}, on {}/{}".format(str(e), dirName, fname))

# Seed each concat with the pre-built empty frame so every DataFrame keeps its
# full column layout even when no matching files were found.
conndf   = pd.concat([conndf]   + frames['conn.log'])
dnsdf    = pd.concat([dnsdf]    + frames['dns.log'])
filesdf  = pd.concat([filesdf]  + frames['files.log'])
ftpdf    = pd.concat([ftpdf]    + frames['ftp.log'])
httpdf   = pd.concat([httpdf]   + frames['http.log'])
noticedf = pd.concat([noticedf] + frames['notice.log'])
sigdf    = pd.concat([sigdf]    + frames['signatures.log'])
smtpdf   = pd.concat([smtpdf]   + frames['smtp.log'])
ssldf    = pd.concat([ssldf]    + frames['ssl.log'])
tunneldf = pd.concat([tunneldf] + frames['tunnel.log'])
weirddf  = pd.concat([weirddf]  + frames['weird.log'])
    
    
In [69]:
    
conndf.shape
    
    Out[69]:
In [74]:
    
conndf.sort_index(inplace=True)
conndf.index.year.unique()
    
    Out[74]:
Were computers even around in 1969?
(Yes, LOL ;) ). But this sample is from roughly 2008-2013. Let's make a new df with records just in that time interval
In [68]:
    
conndf = conndf['2008':'2013']
conndf.shape
    
    Out[68]:
You can use `df['start_time/date':'end_time/date']` to subset a collection of records within a timeframe.
In [75]:
    
conndf.index.year.unique()
    
    Out[75]:
dns.log
In [77]:
    
dnsdf.sort_index(inplace=True)
dnsdf.index.year.unique()
    
    Out[77]:
In [87]:
    
dnsdf.shape
    
    Out[87]:
In [88]:
    
dnsdf = dnsdf['2010':'2013']
dnsdf.shape
    
    Out[88]:
files.log
In [78]:
    
filesdf.sort_index(inplace=True)
filesdf.index.year.unique()
    
    Out[78]:
In [89]:
    
filesdf.shape
    
    Out[89]:
In [90]:
    
filesdf = filesdf['2008':'2013']
filesdf.shape
    
    Out[90]:
ftp.log
In [79]:
    
ftpdf.sort_index(inplace=True)
ftpdf.index.year.unique()
    
    Out[79]:
In [91]:
    
ftpdf.shape
    
    Out[91]:
In [92]:
    
ftpdf = ftpdf['2013']
ftpdf.shape
    
    Out[92]:
http.log
In [80]:
    
httpdf.sort_index(inplace=True)
httpdf.index.year.unique()
    
    Out[80]:
In [93]:
    
httpdf.shape
    
    Out[93]:
In [94]:
    
httpdf = httpdf['2008':'2013']
httpdf.shape
    
    Out[94]:
notice.log
In [81]:
    
noticedf.sort_index(inplace=True)
noticedf.index.year.unique()
    
    Out[81]:
In [95]:
    
noticedf.shape
    
    Out[95]:
In [96]:
    
noticedf = noticedf['2011':'2013']
noticedf.shape
    
    Out[96]:
sig.log
In [82]:
    
sigdf.sort_index(inplace=True)
sigdf.index.year.unique()
    
    Out[82]:
In [97]:
    
sigdf.shape
    
    Out[97]:
smtp.log
In [83]:
    
smtpdf.sort_index(inplace=True)
smtpdf.index.year.unique()
    
    Out[83]:
In [99]:
    
smtpdf.shape
    
    Out[99]:
ssl.log
In [84]:
    
ssldf.sort_index(inplace=True)
ssldf.index.year.unique()
    
    Out[84]:
In [100]:
    
ssldf.shape
    
    Out[100]:
In [101]:
    
ssldf = ssldf['2011':'2013']
ssldf.shape
    
    Out[101]:
tunnel.log
In [85]:
    
tunneldf.sort_index(inplace=True)
tunneldf.index.year.unique()
    
    Out[85]:
In [102]:
    
tunneldf.shape
    
    Out[102]:
weird.log
In [86]:
    
weirddf.sort_index(inplace=True)
weirddf.index.year.unique()
    
    Out[86]:
In [103]:
    
weirddf.shape
    
    Out[103]:
In [105]:
    
weirddf = weirddf['2011':'2013']
weirddf.shape
    
    Out[105]:
In [34]:
    
conndf.head()
    
    Out[34]:
In [106]:
    
conndf.info()
    
    
In [131]:
    
# Get categorical, object-type variables
conndf.select_dtypes(include=['object']).describe()
    
    Out[131]:
In [ ]:
    
conndf['local_orig'].unique()
    
In [135]:
    
conndf.tunnel_parents.value_counts()
    
    Out[135]:
In [136]:
    
conndf.conn_state.value_counts()
    
    Out[136]:
In [134]:
    
# Get numerical, non-object-type variables
conndf.select_dtypes(exclude=['object']).describe()
    
    Out[134]:
conndf TODOs:
- drop 'ts' column
- drop 'local_orig' column, 1 unique
In [126]:
    
# Drop 'ts' (superseded by the timestamp index) and 'local_orig'
# (single unique value, so no signal).
conndf = conndf.drop(columns=['ts', 'local_orig'])
conndf.info()
    
    
In [15]:
    
dnsdf.head()
    
    Out[15]:
In [137]:
    
dnsdf.info()
    
    
In [138]:
    
# Get categorical, object-type variables
dnsdf.select_dtypes(include=['object']).describe()
    
    Out[138]:
In [139]:
    
dnsdf.rejected.value_counts()
    
    Out[139]:
In [140]:
    
dnsdf.qclass_name.value_counts()
    
    Out[140]:
In [141]:
    
dnsdf.TC.value_counts()
    
    Out[141]:
In [142]:
    
# Get numerical, non-object-type variables
dnsdf.select_dtypes(exclude=['object']).describe()
    
    Out[142]:
In [143]:
    
dnsdf.Z.value_counts()
    
    Out[143]:
dnsdf TODOs:
- drop 'ts' column
- drop 'TC' column, 1 unique
In [145]:
    
# Drop 'ts' (superseded by the timestamp index) and 'TC'
# (single unique value, so no signal).
dnsdf = dnsdf.drop(columns=['ts', 'TC'])
dnsdf.head()
    
    
In [16]:
    
filesdf.head()
    
    Out[16]:
In [147]:
    
filesdf.info()
    
    
In [148]:
    
# Get categorical, object-type variables
filesdf.select_dtypes(include=['object']).describe()
    
    Out[148]:
In [149]:
    
# Get numerical, non-object-type variables
filesdf.select_dtypes(exclude=['object']).describe()
    
    Out[149]:
In [150]:
    
filesdf.overflow_bytes.value_counts()
    
    Out[150]:
filesdf TODOs:
- drop 'ts' column
- drop 'extracted' column, 1 unique
- drop 'local_orig' column, 1 unique
- drop 'parent_fuid' column, 1 unique
- drop 'sha256' column, 1 unique
- drop 'overflow_bytes', column, 1 unique
In [151]:
    
# Drop 'ts' (superseded by the timestamp index) plus the five columns
# that carry a single unique value each, so no signal.
filesdf = filesdf.drop(columns=['ts', 'extracted', 'local_orig',
                                'parent_fuid', 'sha256', 'overflow_bytes'])
filesdf.head()
    
    Out[151]:
In [18]:
    
ftpdf.head(5)  # there are only 3 records
    
    Out[18]:
In [152]:
    
ftpdf.info()
    
    
In [153]:
    
# Get categorical, object-type variables
ftpdf.select_dtypes(include=['object']).describe()
    
    Out[153]:
In [154]:
    
# Get numerical, non-object-type variables
ftpdf.select_dtypes(exclude=['object']).describe()
    
    Out[154]:
ftpdf TODOs:
- drop 'ts' column
- drop 'file_size' column, 1 unique
- drop 'id.orig_h' column, 1 unique
- drop 'id.resp_h' column, 1 unique
- drop 'password' column, 1 unique
- drop 'user', column, 1 unique
In [155]:
    
# Drop 'ts' (superseded by the timestamp index) plus the five columns
# that carry a single unique value each, so no signal.
ftpdf = ftpdf.drop(columns=['ts', 'file_size', 'id.orig_h',
                            'id.resp_h', 'password', 'user'])
ftpdf.head()
    
    Out[155]:
In [174]:
    
httpdf.head(5)
    
    Out[174]:
In [156]:
    
httpdf.info()
    
    
In [ ]:
    
    
In [ ]:
    
    
In [ ]:
    
    
In [20]:
    
noticedf.head()
    
    Out[20]:
In [21]:
    
sigdf.head() # only 1 record
    
    Out[21]:
In [24]:
    
smtpdf.head(5)
    
    Out[24]:
In [25]:
    
ssldf.head()
    
    Out[25]:
In [26]:
    
tunneldf.head()  # 2 records
    
    Out[26]:
In [29]:
    
weirddf.head(5)
    
    Out[29]:
In [ ]:
    
### Save DFs (TODO: persistence step not yet written)
    
In [35]:
    
## Connections DF
    
In [37]:
    
conndf.sort_index(inplace=True)
conndf.info()
    
    
In [40]:
    
conndf.index
    
    Out[40]:
In [39]:
    
conndf.plot(y='duration')
    
    
In [61]:
    
import matplotlib
import matplotlib.pyplot
%matplotlib inline
import seaborn
    
In [157]:
    
conndf.sort_index().index
    
    Out[157]:
In [158]:
    
conndf.shape
    
    Out[158]:
In [160]:
    
hourly = conndf.groupby(pd.Grouper(freq='H')).count()
daily = hourly.groupby(pd.Grouper(freq='D')).mean()
    
In [ ]:
    
# TODO(review): this cell was left half-written (`sns.countplot(x=)` is a
# SyntaxError and would abort Run All). Intended column unknown — completed
# countplots for conn_state/proto/service/threat appear in the section below.
    
In [176]:
    
hourly['2012':'2012'].uid.plot(kind='line', figsize=(15,5))
    
    Out[176]:
    
In [170]:
    
    
    Out[170]:
In [172]:
    
# Plot the average value by condition and date
# Plot the hourly record count per threat class.
# NOTE: the original referenced an undefined name `df`; the connection
# frame in this notebook is `conndf`.
ax = conndf.groupby(["threat", pd.Grouper(freq='H')]).count().plot()
    
    
In [177]:
    
### Countplots
    
In [181]:
    
import matplotlib
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
    
In [179]:
    
conndf.select_dtypes(include=['object']).describe()
    
    Out[179]:
In [184]:
    
fig, ax = plt.subplots(figsize=(15, 10))
sns.countplot(ax=ax, x="conn_state", data=conndf)
plt.xticks(rotation=90)
    
    Out[184]:
    
In [186]:
    
fig, ax = plt.subplots(figsize=(5, 5))
sns.countplot(ax=ax, x="proto", data=conndf)
plt.xticks(rotation=90)
    
    Out[186]:
    
In [187]:
    
fig, ax = plt.subplots(figsize=(15, 10))
sns.countplot(ax=ax, x="service", data=conndf)
plt.xticks(rotation=90)
    
    Out[187]:
    
In [188]:
    
fig, ax = plt.subplots(figsize=(15, 10))
sns.countplot(ax=ax, x="threat", data=conndf)
plt.xticks(rotation=90)
    
    Out[188]:
    
In [ ]:
    
#### dns.log
    
In [ ]:
    
dnsdf.select_dtypes(include=['object']).describe()